import os
import numpy as np
import pandas as pd
from glob import glob
import shutil
# image
import cv2
from skimage.io import imread
# TensorFlow
import tensorflow as tf
from tensorflow.keras import layers, models
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
from matplotlib.colors import LinearSegmentedColormap
from mpl_toolkits.axes_grid1.inset_locator import inset_axes
from matplotlib import cm
# Global matplotlib defaults applied to every figure in this notebook.
# NOTE(review): 'seaborn-whitegrid' was renamed to 'seaborn-v0_8-whitegrid'
# in matplotlib >= 3.6 — confirm the installed matplotlib accepts this name.
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
# IPython magic: render figures inline in the notebook output.
%matplotlib inline
import warnings
# Silence all library warnings notebook-wide (deliberate presentation choice).
warnings.filterwarnings("ignore")
The dataset is designed to allow different methods to be tested for examining the trends in CT image data associated with contrast use and patient age. The basic idea is to identify image textures, statistical patterns, and features that correlate strongly with these traits, and possibly to build simple tools for automatically classifying images that have been mislabelled (or for finding outliers, which could indicate suspicious cases, bad measurements, or poorly calibrated machines).
The data are a tiny subset of images from the cancer imaging archive. They consist of the middle slice of all CT images taken where valid age, modality, and contrast tags could be found. This results in 475 series from 69 different patients.
TCIA Archive Link - https://wiki.cancerimagingarchive.net/display/Public/TCGA-LUAD
Base = 'ct_medical_images'

def _index_by_basename(pattern):
    # Map each matching file's basename to its full path, one level deep.
    return {os.path.basename(p): p for p in glob(os.path.join(Base, '*', pattern))}

TIF_path_dict = _index_by_basename('*.tif')
DCM_path_dict = _index_by_basename('*.dcm')
# Load the overview table and attach the on-disk paths for both formats.
Data = pd.read_csv(os.path.join(Base, "overview.csv"), index_col=0).drop(columns=['raw_input_path'])
Data['tiff Path'] = Data['tiff_name'].map(TIF_path_dict)
Data['dicom Path'] = Data['dicom_name'].map(DCM_path_dict)
Data['Contrast'] = Data['Contrast'].astype(str)
display(Data.head(10))
summary = pd.DataFrame({'Number of Instances': [Data.shape[0]],
                        'Number of Attributes': [Data.shape[1]]})
display(summary.style.hide_index())
| Age | Contrast | ContrastTag | id | tiff_name | dicom_name | tiff Path | dicom Path | |
|---|---|---|---|---|---|---|---|---|
| 0 | 60 | True | NONE | 0 | ID_0000_AGE_0060_CONTRAST_1_CT.tif | ID_0000_AGE_0060_CONTRAST_1_CT.dcm | ct_medical_images\tiff_images\ID_0000_AGE_0060... | ct_medical_images\dicom_dir\ID_0000_AGE_0060_C... |
| 1 | 69 | True | NONE | 1 | ID_0001_AGE_0069_CONTRAST_1_CT.tif | ID_0001_AGE_0069_CONTRAST_1_CT.dcm | ct_medical_images\tiff_images\ID_0001_AGE_0069... | ct_medical_images\dicom_dir\ID_0001_AGE_0069_C... |
| 2 | 74 | True | APPLIED | 2 | ID_0002_AGE_0074_CONTRAST_1_CT.tif | ID_0002_AGE_0074_CONTRAST_1_CT.dcm | ct_medical_images\tiff_images\ID_0002_AGE_0074... | ct_medical_images\dicom_dir\ID_0002_AGE_0074_C... |
| 3 | 75 | True | NONE | 3 | ID_0003_AGE_0075_CONTRAST_1_CT.tif | ID_0003_AGE_0075_CONTRAST_1_CT.dcm | ct_medical_images\tiff_images\ID_0003_AGE_0075... | ct_medical_images\dicom_dir\ID_0003_AGE_0075_C... |
| 4 | 56 | True | NONE | 4 | ID_0004_AGE_0056_CONTRAST_1_CT.tif | ID_0004_AGE_0056_CONTRAST_1_CT.dcm | ct_medical_images\tiff_images\ID_0004_AGE_0056... | ct_medical_images\dicom_dir\ID_0004_AGE_0056_C... |
| 5 | 48 | True | NONE | 5 | ID_0005_AGE_0048_CONTRAST_1_CT.tif | ID_0005_AGE_0048_CONTRAST_1_CT.dcm | ct_medical_images\tiff_images\ID_0005_AGE_0048... | ct_medical_images\dicom_dir\ID_0005_AGE_0048_C... |
| 6 | 75 | True | NONE | 6 | ID_0006_AGE_0075_CONTRAST_1_CT.tif | ID_0006_AGE_0075_CONTRAST_1_CT.dcm | ct_medical_images\tiff_images\ID_0006_AGE_0075... | ct_medical_images\dicom_dir\ID_0006_AGE_0075_C... |
| 7 | 61 | True | NONE | 7 | ID_0007_AGE_0061_CONTRAST_1_CT.tif | ID_0007_AGE_0061_CONTRAST_1_CT.dcm | ct_medical_images\tiff_images\ID_0007_AGE_0061... | ct_medical_images\dicom_dir\ID_0007_AGE_0061_C... |
| 8 | 51 | True | NaN | 8 | ID_0008_AGE_0051_CONTRAST_1_CT.tif | ID_0008_AGE_0051_CONTRAST_1_CT.dcm | ct_medical_images\tiff_images\ID_0008_AGE_0051... | ct_medical_images\dicom_dir\ID_0008_AGE_0051_C... |
| 9 | 48 | True | NONE | 9 | ID_0009_AGE_0048_CONTRAST_1_CT.tif | ID_0009_AGE_0048_CONTRAST_1_CT.dcm | ct_medical_images\tiff_images\ID_0009_AGE_0048... | ct_medical_images\dicom_dir\ID_0009_AGE_0048_C... |
| Number of Instances | Number of Attributes |
|---|---|
| 100 | 8 |
def DatasetDist(Table, Target, PD):
    """Show a category summary: value table on the left, pie chart on the right.

    Table  -- DataFrame with columns [Target, 'Count', 'Percentage'].
    Target -- name of the categorical column being summarised (also the title).
    PD     -- dict of presentation settings (colors, sizes, pulls, legend...).
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.02,
                        column_widths=PD['column_widths'],
                        specs=[[{"type": "table"}, {"type": "pie"}]])
    # Right panel: pie chart of the class counts.
    pie = go.Pie(labels=Table[Target].values, values=Table['Count'].values,
                 pull=PD['pull'], textfont=dict(size=PD['textfont']),
                 marker=dict(colors=PD['PieColors'], line=dict(color='black', width=1)))
    fig.add_trace(pie, row=1, col=2)
    fig.update_traces(hole=PD['hole'])
    fig.update_layout(height=PD['height'],
                      legend=dict(orientation=PD['legend_orientation']),
                      legend_title_text=PD['legend_title'])
    # Left panel: the same table with percentages rendered as text.
    formatted = Table.copy()
    formatted['Percentage'] = formatted['Percentage'].map(lambda v: '%%%.2f' % v)
    cell_values = [formatted.loc[:, col].values for col in formatted.columns]
    header = dict(values=list(Table.columns), line_color='darkslategray',
                  fill_color=PD['TableColors'][0], align=['center', 'center'],
                  font=dict(color='white', size=12), height=25)
    cells = dict(values=cell_values, line_color='darkslategray',
                 fill=dict(color=[PD['TableColors'][1], PD['TableColors'][1]]),
                 align=['center', 'center', 'center'], font_size=12, height=20)
    fig.add_trace(go.Table(header=header, columnwidth=PD['tablecolumnwidth'],
                           cells=cells), 1, 1)
    fig.update_layout(title={'text': '<b>' + Target + '<b>', 'x': PD['title_x'],
                             'y': PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
def DistPlot(Feat, Target, PD, Inp = Data):
    """Histogram of `Feat` coloured by `Target`, with mean/median guide lines.

    Feat   -- numeric column to histogram (x axis).
    Target -- categorical column used for colour grouping.
    PD     -- dict of plot settings (nbins, marginal, colours, ylims, titles...).
    Inp    -- source DataFrame; defaults to the module-level `Data`
              (bound once, at definition time).
    """
    # Bug fix: hover_data previously read the global `Data.columns`,
    # silently ignoring the `Inp` argument for any other input frame.
    fig = px.histogram(Inp, x=Feat, nbins=PD['nbins'], color=Target, marginal=PD['marginal'],
                       color_discrete_sequence=PD['Bar_Colors'], hover_data=Inp.columns)
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray')
    # Vertical dotted guides at the median and mean of Feat.
    fig.add_trace(go.Scatter(x=Inp[Feat].median() * np.ones(int(PD['ylims'][1])),
                             y=np.arange(int(PD['ylims'][0]), int(PD['ylims'][1])),
                             name="Median", line=dict(color='RoyalBlue', width=2, dash='dot')))
    fig.add_trace(go.Scatter(x=Inp[Feat].mean() * np.ones(int(PD['ylims'][1])),
                             y=np.arange(int(PD['ylims'][0]), int(PD['ylims'][1])),
                             name="Mean", line=dict(color='Red', width=2, dash='dot')))
    fig.update_layout(title={'text': '<b>' + PD['Title'] + '<b>', 'x': PD['title_x'],
                             'y': PD['title_y'], 'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title='Frequency', plot_bgcolor='white', height=PD['height'],
                      legend=dict(orientation=PD['legend_orientation'],
                                  x=PD['legend_x'], y=PD['legend_y']),
                      legend_title_text=PD['legend_title'])
    fig.update_traces(marker_line_color=PD['line_color'], marker_line_width=0.5, opacity=1)
    fig['layout']['yaxis'].update(range=PD['ylims'])
    fig.show()
Feat = 'Contrast'
Name = 'Contrast'
# Frequency table with the percentage share of each Contrast class.
Table = (Data[Feat].value_counts()
         .to_frame('Count')
         .reset_index(drop=False)
         .rename(columns={'index': Name}))
Table['Percentage'] = np.round(100 * (Table['Count'] / Table['Count'].sum()), 2)
# Pull only the last (smallest) pie slice out of the chart.
Pull = [0] * (len(Table[Name]) - 1) + [.05]
PD = dict(PieColors=['RoyalBlue', 'DeepPink', 'LawnGreen'],
          TableColors=['SlateGray', 'AliceBlue'], hole=.4,
          column_widths=[0.5, 0.5], textfont=14, height=400,
          tablecolumnwidth=[.1, .05, .08], pull=Pull,
          legend_title=Name, legend_orientation='v', title_x=0.5, title_y=0.85)
del Pull
DatasetDist(Table, Target=Name, PD=PD)
PD = dict(nbins=20, Bar_Colors=['RoyalBlue', 'DeepPink', 'LawnGreen'],
          marginal='violin', line_color='Black', ylims=[0, 40],
          Title='Age and Contrast', title_x=0.5, title_y=.95, height=450,
          legend_title='Lesion', legend_orientation='h', legend_x=0.01, legend_y=-0.2)
DistPlot('Age', Target=Feat, PD=PD)
We can convert the TIFF images to JPEG files and copy them into a new directory.
def ImgSep(Data, Target, NewDire, Img_Path, file_name, Convert_to_jpg = False):
    """Sort images into one sub-directory of `NewDire` per class of `Target`.

    Data           -- DataFrame with one row per image.
    Target         -- column whose values name the output sub-directories.
    NewDire        -- output root; wiped and rebuilt on every call.
    Img_Path       -- column holding the source image path.
    file_name      -- column holding the destination file name.
    Convert_to_jpg -- when True, re-encode each image as JPEG (quality 100)
                      instead of copying it verbatim.
    """
    # Start from a clean output directory.
    if os.path.exists(NewDire):
        shutil.rmtree(NewDire)
    # One sub-directory per class (makedirs also creates NewDire itself).
    for subfolder in Data[Target].unique().tolist():
        os.makedirs(os.path.join(NewDire, subfolder), exist_ok=True)
    Progress_Bar = progressbar.ProgressBar(maxval=Data.shape[0],
                                           widgets=[progressbar.Bar('#', '|', '|'),
                                                    progressbar.Percentage()])
    Progress_Bar.start()
    # start=1 so the final update equals maxval and the bar reaches 100%.
    for Counter, (_, row) in enumerate(Data.iterrows(), start=1):
        if Convert_to_jpg:
            # splitext is safer than split('.')[0] for names containing dots.
            base = os.path.splitext(row[file_name])[0]
            cv2.imwrite(os.path.join(NewDire, row[Target], base + '.jpg'),
                        imread(row[Img_Path]), [int(cv2.IMWRITE_JPEG_QUALITY), 100])
        else:
            shutil.copy(row[Img_Path], os.path.join(NewDire, row[Target], row[file_name]))
        Progress_Bar.update(Counter)
    Progress_Bar.finish()
# Convert the TIFF images to JPEG and sort them into one folder per
# Contrast class under '<Base>_mod'.
Path = Base+'_mod'
ImgSep(Data = Data, Target = 'Contrast', NewDire = Path,
Img_Path = 'tiff Path', file_name = 'tiff_name', Convert_to_jpg = True)
|#########################################################################|100%
Now, we can inspect the structure of the new image directory:
def Path_Tree(PATH, Extension):
    """Pretty-print a one-level directory tree with colorama colors.

    Lists every sub-directory of PATH and up to five of its files matching
    `Extension`. Returns {subdir_name: [matching file names]}.
    """
    Out = {}
    sep = ' ' * 3
    BACK = {'Black': Back.BLACK, 'Red': Back.RED, 'Green': Back.GREEN,
            'Yellow': Back.YELLOW, 'Blue': Back.BLUE,
            'Magenta': Back.MAGENTA, 'Cyan': Back.CYAN}
    # os.path.basename is portable; splitting on '\\' only worked on Windows.
    title = os.path.basename(os.path.normpath(PATH))
    print(Style.RESET_ALL + Fore.BLUE + Style.NORMAL + '=' * (len(title) + 1) + Style.RESET_ALL)
    print(Back.BLACK + Fore.CYAN + Style.NORMAL + title + ':' + Style.RESET_ALL)
    print(Style.RESET_ALL + Fore.BLUE + Style.NORMAL + '=' * (len(title) + 1) + Style.RESET_ALL)
    palette = ['Red', 'Green', 'Yellow', 'Blue', 'Magenta', 'Cyan']
    i = 0
    for entry in os.listdir(PATH):
        Sub = os.path.join(PATH, entry)
        if os.path.isdir(Sub):
            # Cycle through the palette instead of pre-building a huge list.
            print('└──', BACK[palette[i % len(palette)]] + Fore.BLACK + Style.NORMAL
                  + entry + ':' + Style.RESET_ALL)
            List = [x for x in os.listdir(Sub) if x.endswith(Extension)]
            Out[entry] = List
            # Derive the label from Extension when the folder is empty, so
            # List[0] is never indexed on an empty list (previously crashed).
            label = (List[0].split('.')[-1] if List else Extension.lstrip('.')).upper()
            print(2 * sep, Fore.BLUE + Style.NORMAL +
                  '%i %s files:' % (len(List), label) + Style.RESET_ALL)
            print(2 * sep, ', '.join(List[:5]) + ', ...')
            i += 1
    return Out
# Print the tree of the new directory; discard the {class: files} map.
_ = Path_Tree(Path, '.jpg')
====================== ct_medical_images_mod: ====================== └── False: 50 JPG files: ID_0050_AGE_0074_CONTRAST_0_CT.jpg, ID_0051_AGE_0063_CONTRAST_0_CT.jpg, ID_0052_AGE_0072_CONTRAST_0_CT.jpg, ID_0053_AGE_0073_CONTRAST_0_CT.jpg, ID_0054_AGE_0082_CONTRAST_0_CT.jpg, ... └── True: 50 JPG files: ID_0000_AGE_0060_CONTRAST_1_CT.jpg, ID_0001_AGE_0069_CONTRAST_1_CT.jpg, ID_0002_AGE_0074_CONTRAST_1_CT.jpg, ID_0003_AGE_0075_CONTRAST_1_CT.jpg, ID_0004_AGE_0056_CONTRAST_1_CT.jpg, ...
batch_size = 64
# Native image size, read from the first TIFF on disk.
(Img_Height, Img_Width) = imread(Data['tiff Path'][0]).shape
# Shared settings for both dataset splits (same seed => disjoint 80/20 split).
_ds_kwargs = dict(directory=Path,
                  validation_split=0.2,
                  color_mode='grayscale',
                  seed=123,
                  image_size=(Img_Height, Img_Width),
                  batch_size=batch_size)
train_ds = tf.keras.preprocessing.image_dataset_from_directory(subset="training", **_ds_kwargs)
val_ds = tf.keras.preprocessing.image_dataset_from_directory(subset="validation", **_ds_kwargs)
Found 100 files belonging to 2 classes. Using 80 files for training. Found 100 files belonging to 2 classes. Using 20 files for validation.
# Show the first 16 images of one training batch in a 4x4 grid.
fig, ax = plt.subplots(4, 4, figsize=(16, 16))
_ = fig.suptitle('A Sample of Dataset', fontweight='bold', fontsize=18)
ax = ax.ravel()
class_names = train_ds.class_names
for images, labels in train_ds.take(1):
    for i, axis in enumerate(ax):
        _ = axis.imshow(images[i].numpy().astype("uint8"), cmap='bone')
        _ = axis.set_title('Contrast: %s' % class_names[labels[i]],
                           fontweight='bold', fontsize=12)
        _ = axis.axis("off")
        _ = axis.set_aspect(1)
fig.tight_layout()
A multi-layer perceptron (MLP) is a class of feedforward artificial neural network (ANN). At each iteration, the algorithm uses the cross-entropy loss to measure the error, then computes the gradient and updates the model. At the end of this iterative process, we reach a better level of agreement between the true and predicted labels, since the error is lower than that of the first step.
Here, we have a small dataset that might result in Overfitting. Thus, we can define a Data augmentation function that generates additional training data from the existing examples by augmenting them using random transformations that yield believable-looking images.
# Random augmentations applied on-the-fly: horizontal flip, small rotation, zoom.
_aug_layers = [
    layers.experimental.preprocessing.RandomFlip("horizontal",
                                                 input_shape=(Img_Height, Img_Width, 1)),
    layers.experimental.preprocessing.RandomRotation(0.1),
    layers.experimental.preprocessing.RandomZoom(0.1),
]
data_augmentation = tf.keras.Sequential(_aug_layers)
# Each panel re-runs the augmentation pipeline on the same batch and shows a
# different random transformation of the batch's first image.
fig, ax = plt.subplots(2, 4, figsize=(16, 8))
_ = fig.suptitle('A Sample of Augmented Images', fontweight='bold', fontsize=18)
ax = ax.ravel()
class_names = train_ds.class_names
for images, labels in train_ds.take(1):
    for i, axis in enumerate(ax):
        augmented_images = data_augmentation(images)
        _ = axis.imshow(augmented_images[0].numpy().astype("uint8"), cmap='bone')
        _ = axis.set_title('Augmented Image %i' % (i + 1),
                           fontweight='bold', fontsize=11)
        _ = axis.axis("off")
        _ = axis.set_aspect(1)
fig.tight_layout()
num_classes = 2
# CNN classifier: augmentation -> rescale -> three conv/pool stages -> dense head.
# NOTE(review): kernel size is 1 (1x1 convolutions throughout) — confirm intended.
model = models.Sequential()
model.add(data_augmentation)
model.add(layers.experimental.preprocessing.Rescaling(1. / 255))
model.add(layers.Conv2D(16, 1, padding='same', activation='relu',
                        input_shape=(Img_Height, Img_Width, 1)))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(32, 1, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(64, 1, padding='same', activation='relu'))
model.add(layers.MaxPooling2D())
model.add(layers.Dropout(0.2))
model.add(layers.Flatten())
model.add(layers.Dense(128, activation='relu'))
model.add(layers.Dense(num_classes))
model.summary()
tf.keras.utils.plot_model(model, show_shapes=True, show_layer_names=True, expand_nested=True)
Model: "sequential_1" _________________________________________________________________ Layer (type) Output Shape Param # ================================================================= sequential (Sequential) (None, 512, 512, 1) 0 _________________________________________________________________ rescaling (Rescaling) (None, 512, 512, 1) 0 _________________________________________________________________ conv2d (Conv2D) (None, 512, 512, 16) 32 _________________________________________________________________ max_pooling2d (MaxPooling2D) (None, 256, 256, 16) 0 _________________________________________________________________ conv2d_1 (Conv2D) (None, 256, 256, 32) 544 _________________________________________________________________ max_pooling2d_1 (MaxPooling2 (None, 128, 128, 32) 0 _________________________________________________________________ conv2d_2 (Conv2D) (None, 128, 128, 64) 2112 _________________________________________________________________ max_pooling2d_2 (MaxPooling2 (None, 64, 64, 64) 0 _________________________________________________________________ dropout (Dropout) (None, 64, 64, 64) 0 _________________________________________________________________ flatten (Flatten) (None, 262144) 0 _________________________________________________________________ dense (Dense) (None, 128) 33554560 _________________________________________________________________ dense_1 (Dense) (None, 2) 258 ================================================================= Total params: 33,557,506 Trainable params: 33,557,506 Non-trainable params: 0 _________________________________________________________________
Compiling and fitting the model
# Number of training epochs.
IT = 41
# from_logits=True because the final Dense layer has no softmax activation.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])
history = model.fit(train_ds, validation_data=val_ds, epochs=IT, verbose=0)
def Search_List(Key, List):
    """Return every element of List that contains the substring Key."""
    return list(filter(lambda item: Key in item, List))
# Display names for the Keras history/metric keys.
Metrics_Names = {'loss': 'Loss', 'accuracy': 'Accuracy', 'mae': 'MAE',
                 'mse': 'MSE', 'recall': 'Recall'}

def Table_modify(df, Metrics_Names = Metrics_Names):
    """Return a copy of df with metric columns renamed and alphabetically
    ordered, plus a leading 'Iteration' index column (0..n-1)."""
    out = df.rename(columns=Metrics_Names)
    out = out[sorted(out.columns)]
    out.insert(0, 'Iteration', np.arange(len(out)), allow_duplicates=False)
    return out
# Split the Keras history into validation ('val_'-prefixed) and train tables.
Validation_Table = Search_List('val_', history.history.keys())
Train_Table = list(set(history.history.keys()) - set(Validation_Table))
Validation_Table = pd.DataFrame(np.array([history.history[x] for x in Validation_Table]).T,
                                columns=Validation_Table)
Train_Table = pd.DataFrame(np.array([history.history[x] for x in Train_Table]).T,
                           columns=Train_Table)
# Drop the 'val_' prefix so both tables share column names.
Validation_Table.columns = [x.replace('val_', '') for x in Validation_Table.columns]
Train_Table = Table_modify(Train_Table)
Validation_Table = Table_modify(Validation_Table)
# Train Set Score
score = model.evaluate(train_ds, batch_size=batch_size, verbose=0)
score = pd.DataFrame(score, index=model.metrics_names).T
score.index = ['Train Set Score']
# Validation Set Score
Temp = model.evaluate(val_ds, batch_size=batch_size, verbose=0)
Temp = pd.DataFrame(Temp, index=model.metrics_names).T
Temp.index = ['Validation Set Score']
# Bug fix: DataFrame.append was removed in pandas 2.0; concat is the
# supported equivalent and works on older pandas as well.
score = pd.concat([score, Temp])
score.rename(columns=Metrics_Names, inplace=True)
score = score.reindex(sorted(score.columns), axis=1)
# NOTE(review): Styler.set_precision is deprecated since pandas 1.3
# (newer versions use .style.format(precision=4)) — confirm installed pandas.
display(score.style.set_precision(4))
| Accuracy | Loss | |
|---|---|---|
| Train Set Score | 0.9625 | 0.1792 |
| Validation Set Score | 0.9000 | 0.1498 |
def Plot_history(history, PD, Title = False, metrics_names = [x.title() for x in model.metrics_names]):
    """Plot training-history curves (left) beside a sampled history table (right).

    history       -- DataFrame with an 'Iteration' column plus one column per metric.
    PD            -- dict with 'yLim', 'Table_Rows', 'tablecolumnwidth', 'TableColors'.
    Title         -- figure title string, or False to suppress the title.
    metrics_names -- metric columns to plot.
        NOTE(review): this default is evaluated once, at definition time, from
        the global `model` — re-compiling `model` later will not refresh it.
    """
    fig = make_subplots(rows=1, cols=2, horizontal_spacing=0.02, column_widths=[0.6, 0.4],
                        specs=[[{"type": "scatter"}, {"type": "table"}]])
    # Left panel: one line per metric.
    Colors = ['OrangeRed', 'MidnightBlue', 'purple']
    for j in range(len(metrics_names)):
        fig.add_trace(go.Scatter(x=history['Iteration'].values,
                                 y=history[metrics_names[j]].values,
                                 line=dict(color=Colors[j], width=1.5),
                                 name=metrics_names[j]), 1, 1)
    fig.update_layout(legend=dict(x=0, y=1.1, traceorder='reversed', font_size=12),
                      dragmode='select', plot_bgcolor='white', height=600,
                      hovermode='closest', legend_orientation='h')
    fig.update_xaxes(range=[history.Iteration.min(), history.Iteration.max()],
                     showgrid=True, gridwidth=1, gridcolor='Lightgray',
                     showline=True, linewidth=1, linecolor='Lightgray', mirror=True, row=1, col=1)
    fig.update_yaxes(range=[0, PD['yLim']], showgrid=True, gridwidth=1, gridcolor='Lightgray',
                     showline=True, linewidth=1, linecolor='Lightgray', mirror=True, row=1, col=1)
    # Right panel: sub-sample to at most Table_Rows rows plus the final epoch.
    # Fix: None comparison is now identity-based (`is not None`), not `== None`.
    if PD['Table_Rows'] is not None:
        ind = np.linspace(0, history.shape[0], PD['Table_Rows'], endpoint=False).round(0).astype(int)
        ind = np.append(ind, history.index[-1])
        history = history[history.index.isin(ind)]
    T = history.copy()
    T[metrics_names] = T[metrics_names].applymap(lambda x: '%.4e' % x)
    Temp = [T.loc[:, col].values for col in T.columns]
    TableColors = PD['TableColors']
    fig.add_trace(go.Table(header=dict(values=list(history.columns), line_color=TableColors[0],
                                       fill_color=TableColors[0], align=['center', 'center'],
                                       font=dict(color=TableColors[1], size=12), height=25),
                           columnwidth=PD['tablecolumnwidth'],
                           cells=dict(values=Temp, line_color=TableColors[0],
                                      fill=dict(color=[TableColors[1], TableColors[1]]),
                                      align=['center', 'center'], font_size=12, height=20)), 1, 2)
    if Title != False:
        fig.update_layout(plot_bgcolor='white',
                          title={'text': Title, 'x': 0.46, 'y': 0.94,
                                 'xanchor': 'center', 'yanchor': 'top'},
                          yaxis_title='Frequency')
    fig.show()
# Render both history panels with the same presentation settings.
PD = {'Table_Rows': 25, 'yLim': 1.4,
      'tablecolumnwidth': [0.3, 0.4, 0.4], 'TableColors': ['Navy', 'White']}
Plot_history(Train_Table, Title='Train Set', PD=PD)
Plot_history(Validation_Table, Title='Validation Set', PD=PD)